In [1]:
import pymc3 as pm
import theano.tensor as tt
import numpy as np
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt
np.random.seed(1337)
%matplotlib inline
In [2]:
# Embed the reference figures from Bellei's frequentist-changepoint blog post,
# which this notebook follows.
# http://www.claudiobellei.com/2016/11/15/changepoint-frequentist/
from IPython.display import Image
from IPython.core.display import HTML
Image(url= "http://www.claudiobellei.com/2016/11/15/changepoint-frequentist/fig1.png")
Out[2]:
In [3]:
# Second figure from the same post.
Image(url= "http://www.claudiobellei.com/2016/11/15/changepoint-frequentist/fig2.png")
Out[3]:
In [4]:
def plot_data(data, type="ts", p=None, xlabel="Episode Number", ylabel="Rating"):
    """Plot a 1-D series against its 1-based position.

    Parameters
    ----------
    data : 1-D array-like of values to plot.
    type, p : accepted but never read; kept for backward compatibility.
        (NOTE(review): `type` also shadows the builtin.)
    xlabel, ylabel : axis label strings.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    positions = np.arange(1, len(data) + 1)
    ax.plot(positions, data)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    # Pad the y-range by 10% on each side (assumes positive-valued data).
    ax.set_ylim([0.9 * np.min(data), 1.1 * np.max(data)])
    fig.set_tight_layout(True)
    plt.show()
In [5]:
# Change in standard deviation
# Simulate a series whose noise level doubles halfway through.
# NOTE(review): the mean also shifts slightly (1000 -> 1020), so this is not
# a *pure* change in standard deviation — confirm that is intended.
size1 = 2500
size2 = 2500
scale1 = 50 #standard deviation of distribution function
scale2 = 100 #standard deviation of distribution function
loc1 = 1000 #mean of normal for first part
loc2 = 1020 #mean of normal for second part
# Reproducible because np.random.seed(...) was set in the imports cell.
d1 = norm.rvs(loc=loc1,size=size1,scale=scale1)
d2 = norm.rvs(loc=loc2,size=size2,scale=scale2)
data = np.concatenate((d1,d2),axis=0)
plot_data(data, xlabel="Time", ylabel="Response")
In [6]:
def maximum_likelihood(data):
    """Profile log-likelihood of a single changepoint in a 1-D series.

    For every candidate switchpoint s in 1..n the series is split into
    data[:s] and data[s:]; each segment is scored under a Normal with the
    segment's own MLE mean/std and the two segment log-likelihoods are
    summed.

    Parameters
    ----------
    data : 1-D numpy array or pandas Series (positional slicing is used).

    Returns
    -------
    numpy.ndarray of shape (n,) where entry s-1 is the total log-likelihood
    of a switch after the first s observations.  Degenerate splits produce
    non-finite entries (a one-element segment has std 0, so scale=0 makes
    logpdf non-finite; at s == n the empty second segment contributes 0),
    which callers filter out with np.isfinite before taking argmax.
    """
    n = data.shape[0]
    likelihoods = np.zeros(n)
    for switchpoint in np.arange(1, n + 1):
        # MLE Normal fit of the segment before the candidate switch.
        mean_1 = data[:switchpoint].mean()
        std_1 = data[:switchpoint].std()
        likelihood_1 = norm.logpdf(data[:switchpoint], loc=mean_1, scale=std_1).sum()
        # MLE Normal fit of the segment from the candidate switch onward.
        mean_2 = data[switchpoint:].mean()
        std_2 = data[switchpoint:].std()
        likelihood_2 = norm.logpdf(data[switchpoint:], loc=mean_2, scale=std_2).sum()
        likelihoods[switchpoint - 1] = likelihood_1 + likelihood_2
    # (Removed the unused `switch = np.arange(1, n+1)` local from the original.)
    return likelihoods
In [7]:
# Profile log-likelihood for every candidate switchpoint of the simulated series.
likes = maximum_likelihood(data)
In [8]:
# Drop the non-finite entries (degenerate end splits) before plotting.
plt.plot(likes[np.isfinite(likes)])
Out[8]:
In [9]:
# NOTE(review): argmax is taken on the *filtered* array, so any non-finite
# entry before the maximum shifts the reported index relative to the true
# switchpoint (switchpoint = unfiltered index + 1) — verify.
likes[np.isfinite(likes)].argmax()
Out[9]:
In [10]:
# Change in mean
# Simulate a series whose mean jumps roughly 10 sigma halfway through.
# NOTE(review): the standard deviation also changes (50 -> 75), so this is
# not a *pure* change in mean — confirm that is intended.
size1 = 2500
size2 = 2500
scale1 = 50 #standard deviation of distribution function
scale2 = 75 #standard deviation of distribution function
loc1 = 535 #mean of normal for first part
loc2 = 1020 #mean of normal for second part
d1 = norm.rvs(loc=loc1,size=size1,scale=scale1)
d2 = norm.rvs(loc=loc2,size=size2,scale=scale2)
data = np.concatenate((d1,d2),axis=0)
plot_data(data, xlabel="Time", ylabel="Response")
In [11]:
# Same profile-likelihood scan on the mean-shift data.
likes = maximum_likelihood(data)
In [12]:
plt.plot(likes[np.isfinite(likes)])
Out[12]:
In [13]:
# See the index-shift caveat on the earlier argmax cell.
likes[np.isfinite(likes)].argmax()
Out[13]:
In [14]:
# Reference implementations from the blog posts (left un-executed on purpose).
# %load http://www.claudiobellei.com/2016/11/15/changepoint-frequentist/changepoint.py
In [15]:
# %load http://www.claudiobellei.com/2017/01/25/changepoint-bayesian/changepoint_bayesian.py
In [16]:
# Source articles this notebook follows.
# http://www.claudiobellei.com/2016/11/15/changepoint-frequentist/
# http://www.claudiobellei.com/2017/01/25/changepoint-bayesian/
In [17]:
# Embed the changepoint.info site for background reading.
from IPython.display import IFrame
IFrame('http://www.changepoint.info/', width=1600, height=900)
Out[17]:
In [18]:
# Available for download here https://datasets.imdbws.com/
# Description of files are here https://www.imdb.com/interfaces/
# NOTE(review): these are large gzipped TSVs downloaded on every run;
# consider the local-file variant below (commented out) to cache them.
# Each file is indexed by its first column (tconst / title id).
df_basics = pd.read_csv("https://datasets.imdbws.com/title.basics.tsv.gz", compression="gzip", sep="\t", index_col=0)
df_episodes = pd.read_csv("https://datasets.imdbws.com/title.episode.tsv.gz", compression="gzip", sep="\t", index_col=0)
df_ratings = pd.read_csv("https://datasets.imdbws.com/title.ratings.tsv.gz", compression="gzip", sep="\t", index_col=0)
# basics = pd.read_csv("../../title.basics.tsv.gz", compression="gzip", sep="\t", index_col=0)
# episodes = pd.read_csv("../../title.episode.tsv.gz", compression="gzip", sep="\t", index_col=0)
# ratings = pd.read_csv("../../title.ratings.tsv.gz", compression="gzip", sep="\t", index_col=0)
In [19]:
df_basics.head()
Out[19]:
In [20]:
# Left-join episodes and ratings onto basics via the shared tconst index
# (DataFrame.join joins on the index by default, how='left').
df_merged = df_basics.join(df_episodes).join(df_ratings)
In [21]:
df_merged.head()
Out[21]:
# BUG FIX: the IMDb dataset's titleType values are camelCase ("tvSeries",
# "tvEpisode", ...); the original lowercase "tvseries" matched no rows.
tvseries_basics = df_basics[df_basics["titleType"] == "tvSeries"]
In [23]:
# IMDb ids for two example series.
dexter = "tt0773262"
# NOTE(review): `lawnorder` does not appear to be used later — the SVU run
# below hardcodes a different id ("tt0203259"); confirm which was intended.
lawnorder = "tt0098844"
In [24]:
# The TSVs use "\\N" for missing values, so coerce these columns to numeric
# (non-numeric entries become NaN).
cols = ["seasonNumber", "episodeNumber", "averageRating"]
df_merged[cols] = df_merged[cols].apply(pd.to_numeric, errors='coerce')
In [25]:
# Order episodes within each show by season then episode number.
colsort = ["parentTconst", "seasonNumber", "episodeNumber"]
df_merged = df_merged.sort_values(colsort)
In [26]:
# Keep only rows with season, episode and rating all present.
df_merged = df_merged.dropna(subset=cols)
In [27]:
# Spot-check a single title by id.
df_basics[df_basics.index == 'tt0084987']
Out[27]:
In [28]:
# NOTE(review): the ~ negates the WHOLE conjunction, i.e. this excludes rows
# that both contain '#' AND have > 50 votes.  If the intent was "no '#' in
# the title AND more than 50 votes", it should read
#   ... & ~df_merged['originalTitle'].str.contains('#') & (df_merged['numVotes'] > 50)
# — confirm before relying on this filter.
tv = df_merged[(df_merged["titleType"] == "tvEpisode") & ~(df_merged['originalTitle'].str.contains('#') & (df_merged['numVotes'] > 50))]
# Shows with more than 200 rated episodes, ranked by rating volatility (std).
top_shows = tv.groupby('parentTconst').size().reset_index()
top_shows = top_shows[top_shows[0] > 200]['parentTconst'].tolist()
tv[tv['parentTconst'].isin(top_shows)].groupby('parentTconst')['averageRating'].std().reset_index().sort_values('averageRating',ascending=False)
Out[28]:
In [29]:
def plot_switchpoint(data, parentTconst):
    """Fit a Bayesian single-changepoint model to a show's episode ratings.

    Builds a PyMC3 model with a DiscreteUniform switchpoint over 1-based
    episode index and separate Normal rating means before/after the switch
    (each with its own Uniform/HalfNormal hyperpriors), samples it, and
    shows the traceplot.

    Parameters
    ----------
    data : DataFrame with parentTconst, titleType and averageRating columns.
    parentTconst : IMDb id of the series whose episodes are modelled.

    Returns
    -------
    (trace, series_data) : the PyMC3 MultiTrace and the episode-level frame.
    """
    series_data = data[data["parentTconst"] == parentTconst]
    series_data = series_data[series_data["titleType"] == "tvEpisode"]
    lower = 1
    upper = series_data.shape[0]
    data = np.array(series_data["averageRating"].values, dtype=np.float32)
    with pm.Model() as model:
        switch = pm.DiscreteUniform('switch', lower=lower, upper=upper)
        early_sigma = pm.HalfNormal('early_sigma', sd=2.5, testval=1.)
        early_mu = pm.Uniform('early_mu', lower=data.min(), upper=data.max())
        late_sigma = pm.HalfNormal('late_sigma', sd=2.5, testval=1.)
        late_mu = pm.Uniform('late_mu', lower=data.min(), upper=data.max())
        early_mean = pm.Normal('early_mean', mu=early_mu, sd=early_sigma)
        late_mean = pm.Normal('late_mean', mu=late_mu, sd=late_sigma)
        # Episode i (1-based) gets the early mean iff the switch is >= i.
        mean = tt.switch(switch >= np.arange(upper) + 1, early_mean, late_mean)
        # FIX: reuse the `data` array computed above instead of re-extracting
        # the ratings a second time for the observed node.
        # NOTE(review): the observation noise is fixed at sd=1 — confirm.
        ratings = pm.Normal('ratings', mu=mean, sd=1., observed=data)
        # Sanity-check that every RV has finite logp at the start point.
        for RV in model.basic_RVs:
            print(RV.name, RV.logp(model.test_point))
        tr = pm.sample(10000, tune=500, cores=4)
    pm.traceplot(tr)
    return tr, series_data
In [30]:
def plot_trace_data(tr, data):
    """Plot episode ratings with the posterior switchpoint and mean overlay.

    Parameters
    ----------
    tr : PyMC3 trace containing 'switch', 'early_mean' and 'late_mean'.
    data : 1-D series of per-episode average ratings.
    """
    n_episodes = len(data)
    episodes = np.arange(1, n_episodes + 1)
    plt.figure(figsize=(10, 8))
    plt.plot(episodes, data, '.')
    plt.ylabel("Average Rating", fontsize=16)
    plt.xlabel("Episode", fontsize=16)
    # Posterior-mean switchpoint as a vertical line.
    plt.vlines(tr['switch'].mean(), data.min(), data.max(), color='C1')
    # Posterior-mean rating per episode: average early_mean over the draws
    # whose switch is after this episode, late_mean over the rest.
    posterior_mean = np.zeros_like(data, dtype='float')
    total_draws = len(tr) * tr.nchains
    for episode_idx in np.arange(n_episodes):
        is_early = episode_idx < tr['switch']
        posterior_mean[episode_idx] = (tr['early_mean'][is_early].sum()
                                       + tr['late_mean'][~is_early].sum()) / total_draws
    # Shade the HPD interval of the switchpoint.
    sp_hpd = pm.hpd(tr['switch'])
    plt.fill_betweenx(y=[data.min(), data.max()],
                      x1=sp_hpd[0], x2=sp_hpd[1], alpha=0.5, color='C1');
    plt.plot(episodes, posterior_mean, 'k--', lw=2);
    plt.show()
In [31]:
# simpsons
# https://www.zinkov.com/posts/2017-11-03-simpsons-changepoint/
simpons_tt = "tt0096697"
series_data = df_merged[df_merged["parentTconst" ]== simpons_tt]
series_data = series_data[series_data["titleType"] == "tvEpisode"]
plot_data(series_data["averageRating"])
In [32]:
# Bayesian changepoint fit (slow: 10k draws x 4 chains).
tr, series_data = plot_switchpoint(df_merged, simpons_tt)
In [33]:
# Frequentist profile likelihood for comparison with the Bayesian posterior.
likes = maximum_likelihood(series_data["averageRating"])
likes = likes[np.isfinite(likes)]
plt.plot(likes)
plt.show()
# NOTE(review): exponentiating large negative log-likelihoods can underflow
# to 0, making likes.sum() zero and the normalisation NaN — subtracting
# likes.max() before np.exp would be numerically safer; verify.
likes = np.exp(likes)
likes_norm = likes/likes.sum()
print(likes_norm.argmax())
plt.plot(likes_norm)
plt.show()
In [34]:
# Posterior-mean switchpoint vs. the plot overlay.
print(int(tr['switch'].mean()))
plot_trace_data(tr, series_data["averageRating"])
In [35]:
# Episode at the posterior-mean switchpoint.
series_data.iloc[int(tr['switch'].mean()), :]
Out[35]:
In [36]:
# Law and Order SVU
# NOTE(review): the earlier `lawnorder` id (tt0098844) is the original series;
# this hardcoded id is SVU — confirm which was intended.
tr, series_data = plot_switchpoint(df_merged, "tt0203259")
In [37]:
# NOTE(review): cells In[37]/[40]/[44] are the same likelihood-comparison
# recipe copy-pasted per show — a candidate for a helper function.  The
# np.exp underflow caveat from the Simpsons cell applies here too.
likes = maximum_likelihood(series_data["averageRating"])
likes = likes[np.isfinite(likes)]
plt.plot(likes)
plt.show()
likes = np.exp(likes)
likes_norm = likes/likes.sum()
print(likes_norm.argmax())
plt.plot(likes_norm)
plt.show()
In [38]:
print(int(tr['switch'].mean()))
plot_trace_data(tr, series_data["averageRating"])
In [39]:
# Dexter
tr, series_data = plot_switchpoint(df_merged, dexter)
In [40]:
likes = maximum_likelihood(series_data["averageRating"])
likes = likes[np.isfinite(likes)]
plt.plot(likes)
plt.show()
likes = np.exp(likes)
likes_norm = likes/likes.sum()
print(likes_norm.argmax())
plt.plot(likes_norm)
plt.show()
In [41]:
print(int(tr['switch'].mean()))
plot_trace_data(tr, series_data["averageRating"])
In [42]:
# Episode at the posterior-mean switchpoint.
series_data.iloc[int(tr['switch'].mean()), :]
Out[42]:
In [43]:
# The Apprentice
apprentice_tt = "tt0364782"
tr, series_data = plot_switchpoint(df_merged, apprentice_tt)
In [44]:
likes = maximum_likelihood(series_data["averageRating"])
likes = likes[np.isfinite(likes)]
plt.plot(likes)
plt.show()
likes = np.exp(likes)
likes_norm = likes/likes.sum()
print(likes_norm.argmax())
plt.plot(likes_norm)
plt.show()
In [45]:
print(int(tr['switch'].mean()))
plot_trace_data(tr, series_data["averageRating"])
In [46]:
series_data.iloc[int(tr['switch'].mean()), :]
Out[46]:
In [47]:
def seed_maximum_likelihood(data):
    """Maximum-likelihood changepoint plus segment MLE stats, for seeding MCMC.

    Same profile likelihood as maximum_likelihood(), but also records each
    candidate split's segment means/stds so the best split's statistics can
    seed the Bayesian model's start values.

    Parameters
    ----------
    data : 1-D numpy array or pandas Series (positional slicing is used).

    Returns
    -------
    (switch_idx, mean_1, std_1, mean_2, std_2) where switch_idx is the
    0-based index of the best split (switchpoint = switch_idx + 1) and the
    stats are the MLE Normal fits of data[:switchpoint] / data[switchpoint:].
    """
    n = data.shape[0]
    likelihoods = np.zeros(n)
    mean_1s = np.zeros(n)
    std_1s = np.zeros(n)
    mean_2s = np.zeros(n)
    std_2s = np.zeros(n)
    for switchpoint in np.arange(1, n + 1):
        mean_1 = data[:switchpoint].mean()
        std_1 = data[:switchpoint].std()
        likelihood_1 = norm.logpdf(data[:switchpoint], loc=mean_1, scale=std_1).sum()
        mean_1s[switchpoint - 1] = mean_1
        std_1s[switchpoint - 1] = std_1
        mean_2 = data[switchpoint:].mean()
        std_2 = data[switchpoint:].std()
        likelihood_2 = norm.logpdf(data[switchpoint:], loc=mean_2, scale=std_2).sum()
        mean_2s[switchpoint - 1] = mean_2
        std_2s[switchpoint - 1] = std_2
        likelihoods[switchpoint - 1] = likelihood_1 + likelihood_2
    # BUG FIX: the original took argmax of likelihoods[np.isfinite(...)],
    # an index into the *filtered* array, then used it to index the
    # unfiltered mean/std arrays — misaligned whenever a non-finite entry
    # (e.g. the always-degenerate switchpoint 1) precedes the maximum.
    # Mask non-finite entries to -inf so indices stay aligned instead.
    likelihoods = np.where(np.isfinite(likelihoods), likelihoods, -np.inf)
    switch = int(likelihoods.argmax())
    return switch, mean_1s[switch], std_1s[switch], mean_2s[switch], std_2s[switch]
In [48]:
def plot_seed_switchpoint(data, parentTconst):
    """Bayesian changepoint fit seeded at the maximum-likelihood estimates.

    Identical model to plot_switchpoint(), but the start values (testval)
    for the switchpoint and segment means/stds come from
    seed_maximum_likelihood(), to start the sampler near the mode.

    Parameters
    ----------
    data : DataFrame with parentTconst, titleType and averageRating columns.
    parentTconst : IMDb id of the series to model.

    Returns
    -------
    (trace, series_data) : the PyMC3 MultiTrace and the episode-level frame.
    """
    series_data = data[data["parentTconst"] == parentTconst]
    series_data = series_data[series_data["titleType"] == "tvEpisode"]
    lower = 1
    upper = series_data.shape[0]
    data = np.array(series_data["averageRating"].values, dtype=np.float32)
    s_switch, s_early_mean, s_early_std, s_late_mean, s_late_std = seed_maximum_likelihood(data)
    with pm.Model() as model:
        # BUG FIX: seed_maximum_likelihood returns a 0-based split index
        # (switchpoint = index + 1), but DiscreteUniform's support starts at
        # lower=1 — seed one past the index so the start value is valid.
        switch = pm.DiscreteUniform('switch', lower=lower, upper=upper, testval=s_switch + 1)
        # NOTE(review): a constant segment gives s_*_std == 0, an invalid
        # HalfNormal start value — confirm this cannot occur for real shows.
        early_sigma = pm.HalfNormal('early_sigma', sd=2.5, testval=s_early_std)
        early_mu = pm.Uniform('early_mu', lower=data.min(), upper=data.max(), testval=s_early_mean)
        late_sigma = pm.HalfNormal('late_sigma', sd=2.5, testval=s_late_std)
        late_mu = pm.Uniform('late_mu', lower=data.min(), upper=data.max(), testval=s_late_mean)
        early_mean = pm.Normal('early_mean', mu=early_mu, sd=early_sigma)
        late_mean = pm.Normal('late_mean', mu=late_mu, sd=late_sigma)
        # Episode i (1-based) gets the early mean iff the switch is >= i.
        mean = tt.switch(switch >= np.arange(upper) + 1, early_mean, late_mean)
        # FIX: reuse the `data` array computed above for the observed node.
        ratings = pm.Normal('ratings', mu=mean, sd=1., observed=data)
        # Sanity-check that every RV has finite logp at the start point.
        for RV in model.basic_RVs:
            print(RV.name, RV.logp(model.test_point))
        tr = pm.sample(10000, tune=500, cores=4)
    pm.traceplot(tr)
    return tr, series_data
In [49]:
# Re-run the Simpsons fit with ML-seeded start values for comparison.
tr, series_data = plot_seed_switchpoint(df_merged, simpons_tt)
In [50]:
# Posterior-mean switchpoint and the overlay plot for the seeded run.
print(int(tr['switch'].mean()))
plot_trace_data(tr, series_data["averageRating"])
Maximum Likelihood Pros
Bayesian Pros
Future Work